library(xgboost)
library(randomForest)
library(tidyverse)
library(lubridate)
source('functions.r')
load("Table_construction.Rdata")
features = features %>%
# Add other useful information:
inner_join(
data_before %>%
select(person_id, screening_date, people) %>%
unnest() %>%
select(person_id, screening_date, race, sex, name),
by = c("person_id","screening_date")
) %>%
inner_join(features_on, by = c("person_id","screening_date")) %>%
inner_join(outcomes, by = c("person_id","screening_date")) %>%
# Create as many features as possible:
mutate(
raw_score = `Risk of Violence_raw_score`, # Adjust for violent/general
decile_score = `Risk of Violence_decile_score`, # Adjust for violent/general
p_juv_fel_count = pmin(p_juv_fel_count,2),
p_felprop_violarrest = pmin(p_felprop_violarrest,5),
p_murder_arrest = pmin(p_murder_arrest,3),
p_felassault_arrest = pmin(p_felassault_arrest,3),
p_misdemassault_arrest = pmin(p_misdemassault_arrest,3),
#p_famviol_arrest = pmin(p_famviol_arrest,3),
p_sex_arrest = pmin(p_sex_arrest,3),
p_weapons_arrest = pmin(p_weapons_arrest,3),
p_n_on_probation = pmin(p_n_on_probation,5),
p_current_on_probation = pmin(p_current_on_probation,5),
p_prob_revoke = pmin(p_prob_revoke,5),
race_black = if_else(race=="African-American",1,0),
race_white = if_else(race=="Caucasian",1,0),
race_hispanic = if_else(race=="Hispanic",1,0),
race_asian = if_else(race=="Asian",1,0),
race_native = if_else(race=="Native American",1,0), # race == "Other" is the baseline
# Subscales:
vio_hist = p_juv_fel_count+
p_felprop_violarrest+
p_murder_arrest+
p_felassault_arrest+
p_misdemassault_arrest+
#p_famviol_arrest+
p_sex_arrest+
p_weapons_arrest,
history_noncomp=p_prob_revoke+
p_probation+p_current_on_probation+
p_n_on_probation,
# Filters (TRUE for obserations to keep)
filt1 = `Risk of Recidivism_decile_score` != -1, `Risk of Violence_decile_score` != -1, # Filter 1
filt3 = !is.na(current_offense_date), # Filter 3
filt4 = current_offense_date <= current_offense_date_limit, # Filter 4
filt5 = p_current_age > 18 & p_current_age <= 70 # Filter 5
)
features_f_age = features %>%
filter(filt1,filt5) %>%
select(p_current_age, raw_score)
lb_age = features_f_age %>%
group_by(p_current_age) %>%
#arrange(raw_score, .by_group=TRUE) %>%
arrange(raw_score) %>%
top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value
mdl_age = lm(raw_score ~
I(p_current_age^4) +
I(p_current_age^3) +
I(p_current_age^2) +
p_current_age,
data=lb_age)
# More precision for paper
summary(mdl_age)
##
## Call:
## lm(formula = raw_score ~ I(p_current_age^4) + I(p_current_age^3) +
## I(p_current_age^2) + p_current_age, data = lb_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57815 -0.00957 0.00722 0.03974 0.13353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.554e+00 8.206e-01 1.894 0.061779 .
## I(p_current_age^4) 4.078e-07 2.904e-07 1.404 0.164074
## I(p_current_age^3) -8.818e-05 5.109e-05 -1.726 0.088148 .
## I(p_current_age^2) 7.437e-03 3.227e-03 2.304 0.023731 *
## p_current_age -3.161e-01 8.631e-02 -3.662 0.000441 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09085 on 82 degrees of freedom
## Multiple R-squared: 0.9799, Adjusted R-squared: 0.979
## F-statistic: 1001 on 4 and 82 DF, p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "1.55398886995612506290e+00" "4.07808994844784392896e-07"
## [3] "-8.81767707692677913051e-05" "7.43693932415081908338e-03"
## [5] "-3.16106532728162026302e-01"
## Add f(age) to features
features = features %>%
mutate(
f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
raw_score__f_age = raw_score - f_age,
filt6 = raw_score >= f_age
)
## Age polynomial plot
xmin = 18
xmax = 70
xx = seq(xmin,xmax, length.out=1000)
ggplot()+
geom_point(aes(x=p_current_age, raw_score), color="#619CFF",alpha=.3, data=features_f_age) +
geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("Violent score") +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="none")
ggsave("Figures/age_LB_violent.pdf",width = 3.5, height = 2.5, units = "in")
### Compute lower bound of raw_score__f_age for each vio_hist value:
# Apply filters:
features_vio_hist = features %>%
filter(filt1,filt3) %>% # Careful, filter 6 applied later since we need these points for plot
select(vio_hist, raw_score__f_age,filt6)
# Compute lower bound
lb_vio_hist = features_vio_hist %>%
filter(filt6) %>% # Now apply filter 6
select(-filt6) %>%
group_by(vio_hist)%>%
top_n(n=-1,wt=raw_score__f_age)%>%
rename(g_vio_hist = raw_score__f_age) %>%
#arrange(vio_hist,.by_group=TRUE) %>%
arrange(vio_hist) %>%
ungroup()
# Use last value of g_vio_hist if vio_hist > vio_hist_cutoff
vio_hist_cutoff = 6
lb_vio_hist_cutoff = lb_vio_hist$g_vio_hist[lb_vio_hist$vio_hist==vio_hist_cutoff]
lb_vio_hist = lb_vio_hist %>%
mutate(g_vio_hist = if_else(vio_hist < vio_hist_cutoff, g_vio_hist, lb_vio_hist_cutoff))
# Add g(vio_hist) to features
features = features %>%
left_join(lb_vio_hist, by="vio_hist") %>%
mutate(raw_score__f_age__g_vio_hist = raw_score__f_age - g_vio_hist)
lb_vio_hist_plot =
data.frame(vio_hist_xx = seq(0,13,length.out=10000)) %>%
mutate(vio_hist = round(vio_hist_xx)) %>%
left_join(lb_vio_hist,by="vio_hist")
ggplot() +
geom_point(data=features_vio_hist,aes(x=vio_hist,y=raw_score__f_age,color=filt6),alpha=.5) +
geom_line(data=lb_vio_hist_plot,aes(x=vio_hist_xx,y=g_vio_hist,color="lb"))+
theme_bw()+
xlab("Sum of History of Violence Components") +
ylab(expression(Violent~score~-~f[viol~age])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE","lb"),
labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age]),expression(g[viol~hist])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38","lb"="#F8766D"))
rm(lb_vio_hist_plot)
ggsave("Figures/vio_hist_LB_violent.pdf",width = 5, height = 3, units = "in")
features %>%
filter(filt1,filt3) %>%
select(history_noncomp, raw_score__f_age__g_vio_hist, filt6) %>%
ggplot() +
geom_point(aes(x=history_noncomp,y=raw_score__f_age__g_vio_hist,color=filt6),alpha=.5) +
theme_bw()+
xlab("Sum of History of Noncompliance Components") +
ylab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE"),
labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
ggsave("Figures/hist_noncomp_LB_violent.pdf",width = 5, height = 3, units = "in")
There are a few groups of predictors we will use: * Group 1: without using age variables or race * Group 2: without using age variables but with race * Group 3: without using race but with age variables * Group 4: using age variables and race
#### Generic stuff (applies to all models)
## Filter rows
features_filt = features %>%
filter(filt1, filt3)
## Set parameters (each combination will be run)
param <- list(objective = "reg:linear",
eval_metric = "rmse",
eta = c(.05,.1),
gamma = c(.5, 1),
max_depth = c(2,5),
min_child_weight = c(5,10),
subsample = c(1),
colsample_bytree = c(1)
)
# svm
param_svm = list(
type = 'eps-regression',
cost = c(0.5,1,2),
epsilon = c(0.5,1,1.5),
gamma_scale = c(0.5,1,2)
)
res_rmse = data.frame(Group = 1:4, lm = NA, xgb = NA, rf = NA, svm = NA)
### Create group 1 training data
## Select features and round count features
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
raw_score__f_age)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
"label" = train %>% select(raw_score__f_age) %>% as.matrix()
)
mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9576 -0.3445 -0.1178 0.2532 3.7943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8451607 0.0153243 55.152 < 2e-16 ***
## p_age_first_offense -0.0065319 0.0004888 -13.362 < 2e-16 ***
## p_juv_fel_count 0.1306356 0.0196060 6.663 2.84e-11 ***
## p_felprop_violarrest 0.1124162 0.0061483 18.284 < 2e-16 ***
## p_murder_arrest 0.1027832 0.0481811 2.133 0.0329 *
## p_felassault_arrest 0.1810567 0.0114713 15.783 < 2e-16 ***
## p_misdemassault_arrest 0.1669203 0.0107051 15.593 < 2e-16 ***
## p_sex_arrest 0.1747182 0.0407972 4.283 1.87e-05 ***
## p_weapons_arrest 0.1033170 0.0158291 6.527 7.07e-11 ***
## p_n_on_probation 0.0787288 0.0051618 15.252 < 2e-16 ***
## p_current_on_probation 0.1522874 0.0260538 5.845 5.24e-09 ***
## p_prob_revoke 0.1443481 0.0195092 7.399 1.50e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5048 on 9030 degrees of freedom
## Multiple R-squared: 0.32, Adjusted R-squared: 0.3191
## F-statistic: 386.3 on 11 and 9030 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(46)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 15
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "1"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("Violent score - f(age)") +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(55656)
mdl_rf = randomForest(
formula = raw_score__f_age ~ .,
data = train
)
res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 1
## type "eps-regression"
## cost "0.5"
## epsilon "0.5"
## gamma_scale "0.5"
## gamma "0.04166667"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
### Create group 2 training data
## Select features and round count features
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__f_age)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
"label" = train %>% select(raw_score__f_age) %>% as.matrix()
)
mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9759 -0.3321 -0.1098 0.2460 3.6383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6137104 0.0260561 23.553 < 2e-16 ***
## p_age_first_offense -0.0044461 0.0004968 -8.949 < 2e-16 ***
## p_juv_fel_count 0.1212941 0.0192844 6.290 3.33e-10 ***
## p_felprop_violarrest 0.1088356 0.0060435 18.009 < 2e-16 ***
## p_murder_arrest 0.1060919 0.0473390 2.241 0.025 *
## p_felassault_arrest 0.1767698 0.0112734 15.680 < 2e-16 ***
## p_misdemassault_arrest 0.1668764 0.0105170 15.867 < 2e-16 ***
## p_sex_arrest 0.1659144 0.0400788 4.140 3.51e-05 ***
## p_weapons_arrest 0.0929469 0.0155661 5.971 2.45e-09 ***
## p_n_on_probation 0.0736827 0.0050787 14.508 < 2e-16 ***
## p_current_on_probation 0.1424988 0.0256041 5.565 2.69e-08 ***
## p_prob_revoke 0.1351758 0.0191733 7.050 1.92e-12 ***
## race_black 0.2747839 0.0225159 12.204 < 2e-16 ***
## race_white 0.1209223 0.0227808 5.308 1.13e-07 ***
## race_hispanic 0.0267705 0.0273158 0.980 0.327
## race_asian -0.0244794 0.0753349 -0.325 0.745
## race_native 0.1538104 0.0962585 1.598 0.110
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4958 on 9025 degrees of freedom
## Multiple R-squared: 0.3442, Adjusted R-squared: 0.3431
## F-statistic: 296.1 on 16 and 9025 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(390)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("Violent score - f(age)") +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(2728)
mdl_rf = randomForest(
formula = raw_score__f_age ~ .,
data = train
)
res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 2
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "0.5"
## gamma "0.02941176"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
### Create group 3 training data
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
raw_score__f_age)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
"label" = train %>% select(raw_score__f_age) %>% as.matrix()
)
mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8853 -0.3396 -0.1105 0.2422 3.8069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7750803 0.0158841 48.796 < 2e-16 ***
## p_current_age 0.0140701 0.0009606 14.646 < 2e-16 ***
## p_age_first_offense -0.0195380 0.0010109 -19.327 < 2e-16 ***
## p_juv_fel_count 0.1216902 0.0193879 6.277 3.62e-10 ***
## p_felprop_violarrest 0.0902790 0.0062621 14.417 < 2e-16 ***
## p_murder_arrest 0.0807082 0.0476452 1.694 0.09031 .
## p_felassault_arrest 0.1480263 0.0115601 12.805 < 2e-16 ***
## p_misdemassault_arrest 0.1516864 0.0106318 14.267 < 2e-16 ***
## p_sex_arrest 0.1227168 0.0404793 3.032 0.00244 **
## p_weapons_arrest 0.0757266 0.0157582 4.806 1.57e-06 ***
## p_n_on_probation 0.0694142 0.0051413 13.501 < 2e-16 ***
## p_current_on_probation 0.1673132 0.0257716 6.492 8.91e-11 ***
## p_prob_revoke 0.0562168 0.0201996 2.783 0.00540 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4989 on 9029 degrees of freedom
## Multiple R-squared: 0.3358, Adjusted R-squared: 0.3349
## F-statistic: 380.3 on 12 and 9029 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(34)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(Violent~score~-~f[viol~age])) +
ylab("XGBoost prediction") +
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw-fage_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(7872)
mdl_rf = randomForest(
formula = raw_score__f_age ~ .,
data = train
)
res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 11
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.07692308"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
### Create group 4 training data
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__f_age)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
"label" = train %>% select(raw_score__f_age) %>% as.matrix()
)
mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9064 -0.3264 -0.1039 0.2364 3.6516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5544869 0.0260945 21.249 < 2e-16 ***
## p_current_age 0.0135686 0.0009462 14.340 < 2e-16 ***
## p_age_first_offense -0.0169983 0.0010038 -16.934 < 2e-16 ***
## p_juv_fel_count 0.1127517 0.0190788 5.910 3.55e-09 ***
## p_felprop_violarrest 0.0876690 0.0061557 14.242 < 2e-16 ***
## p_murder_arrest 0.0839132 0.0468369 1.792 0.07323 .
## p_felassault_arrest 0.1451296 0.0113640 12.771 < 2e-16 ***
## p_misdemassault_arrest 0.1521143 0.0104506 14.556 < 2e-16 ***
## p_sex_arrest 0.1160088 0.0397845 2.916 0.00356 **
## p_weapons_arrest 0.0663196 0.0155042 4.278 1.91e-05 ***
## p_n_on_probation 0.0647496 0.0050606 12.795 < 2e-16 ***
## p_current_on_probation 0.1570690 0.0253391 6.199 5.94e-10 ***
## p_prob_revoke 0.0506041 0.0198557 2.549 0.01083 *
## race_black 0.2658235 0.0222736 11.934 < 2e-16 ***
## race_white 0.1089137 0.0225424 4.831 1.38e-06 ***
## race_hispanic 0.0308834 0.0270128 1.143 0.25295
## race_asian -0.0139690 0.0744987 -0.188 0.85127
## race_native 0.1331917 0.0951964 1.399 0.16181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4903 on 9024 degrees of freedom
## Multiple R-squared: 0.3589, Adjusted R-squared: 0.3576
## F-statistic: 297.1 on 17 and 9024 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP
set.seed(11)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 6
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age
res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(Violent~score~-~f[viol~age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12))
ggsave("Figures/raw-fage_xgboost_gp4_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
highlight = data.frame(
person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
highlight = TRUE
)
df_plot = features_filt %>%
bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
left_join(highlight, by = c("person_id","screening_date")) %>%
mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))
person_id_text_topright = c(8491, 8375, 1497)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(5722, 11231)
person_id_text_botright = c(799, 11312, 1284, 11414)
person_id_text_botleft = c()
ggplot() +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), data = filter(df_plot, highlight=="In Table 5")) +
theme_bw()+
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) +
xlab(expression(XGBoost~pred.~of~violent~score~-~f[age])) +
ylab("Violent score")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12),
#legend.position = "top",
legend.position="none") +
scale_color_discrete(name = element_blank())
ggsave("Figures/xgboost_rawScore_annotate_violent.pdf",width = 3, height = 3, units = "in")
set.seed(379)
mdl_rf = randomForest(
formula = raw_score__f_age ~ .,
data = train
)
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 10
## type "eps-regression"
## cost "0.5"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.05555556"
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
knitr::kable(res_rmse)
| Group | lm | xgb | rf | svm |
|---|---|---|---|---|
| 1 | 0.5044421 | 0.4831389 | 0.4950131 | 0.4915281 |
| 2 | 0.4953577 | 0.4699155 | 0.4878036 | 0.4812358 |
| 3 | 0.4985543 | 0.4682046 | 0.4930407 | 0.4757220 |
| 4 | 0.4898085 | 0.4565221 | 0.4827517 | 0.4740605 |
We use the “Group 3” models but predict the raw score and the raw score minus all of the lower bounds we have fitted. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound only.
### Create group 3 training data
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
raw_score)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score) %>% as.matrix(),
"label" = train %>% select(raw_score) %>% as.matrix()
)
set.seed(347)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score
print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.465"
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("Violent score") +
ylab("XGBoost prediction")+
#annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")
### Create group 3 training data
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
raw_score__f_age__g_vio_hist)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__f_age__g_vio_hist) %>% as.matrix(),
"label" = train %>% select(raw_score__f_age__g_vio_hist) %>% as.matrix()
)
set.seed(7301)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 13
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age__g_vio_hist
print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.4691"
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist])) +
ylab("XGBoost prediction")+
#annotate("text", x = 0.5, y = 4, label = paste("RMSE:",round(rmse(pred, actual),4)))+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw-fage-gviohist_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")
propub = features_filt %>%
filter(filt4) %>% # Only people with valid recidivism values
mutate(age_low = if_else(p_current_age < 25,1,0),
age_high = if_else(p_current_age > 45,1,0),
female = if_else(sex=="Female",1,0),
n_priors = p_felony_count_person + p_misdem_count_person,
compas_high = if_else(`Risk of Violence_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis
print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 5759"
mdl_glm = glm(compas_high ~
female +
age_high +
age_low +
as.factor(race) +
p_charge +
is_misdem +
recid_violent,
family=binomial(link='logit'), data=propub)
summary(mdl_glm)
##
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) +
## p_charge + is_misdem + recid_violent, family = binomial(link = "logit"),
## data = propub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6118 -0.6001 -0.3213 0.7395 2.9655
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.368882 0.094675 -25.021 < 2e-16 ***
## female -0.651197 0.099532 -6.543 6.05e-11 ***
## age_high -1.727458 0.174615 -9.893 < 2e-16 ***
## age_low 2.593419 0.079247 32.726 < 2e-16 ***
## as.factor(race)African-American 0.745976 0.083088 8.978 < 2e-16 ***
## as.factor(race)Asian -0.765381 0.678659 -1.128 0.259
## as.factor(race)Hispanic 0.005838 0.146006 0.040 0.968
## as.factor(race)Native American 0.503070 0.740727 0.679 0.497
## as.factor(race)Other -0.039456 0.170241 -0.232 0.817
## p_charge 0.074006 0.004375 16.915 < 2e-16 ***
## is_misdem -0.362223 0.077149 -4.695 2.67e-06 ***
## recid_violent 0.704532 0.093094 7.568 3.79e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7378.0 on 5758 degrees of freedom
## Residual deviance: 4934.2 on 5747 degrees of freedom
## AIC: 4958.2
##
## Number of Fisher Scoring iterations: 6